# This code chunk simply makes sure that all the libraries used here are installed. 
packages <- c("knitr","dplyr",  "tidyr", "caret", "ggplot2", "plotly","lubridate","leaflet", "stringr")
if ( length(missing_pkgs <- setdiff(packages, rownames(installed.packages()))) > 0) {
message("Installing missing package(s): ", paste(missing_pkgs, collapse = ", "))
install.packages(missing_pkgs)
}

Intro:

Cleaning our Data

We will combine the Issue.Date and the Issue.Time into it’s own column. To do this we will need to restucture the data into a time series friendly format.

Now there is time stamp on all the dates, “T00:00:00”, that needs to be removed.

#Removing excess information
FTP$Issue.Date <- sub("T.*", "", FTP$Issue.Date)

We will use some string processing techniqes to clean up our Issue.Time column

Let’s take a look at the data and see if there is anything else we can wrangle

We can see in Latitude and Longitude a default value of 99999 that will need to be removed.

##We can notice that there is a default of 99999 when a cordinate isn't entered. We will remove these
FTP<- FTP %>%
  filter(Latitude != 99999) 

Now for converting the cordinates from US feet to Logitutde and Latitude cordinates

#Create projection element to convert from US Feet coordinates to normal lat lon
pj <- "+proj=lcc +lat_1=34.03333333333333 +lat_2=35.46666666666667 +lat_0=33.5 +lon_0=-118 +x_0=2000000 +y_0=500000.0000000002 +ellps=GRS80 +datum=NAD83 +to_meter=0.3048006096012192 no_defs"

#Add converted latitude longitude to FTP dataframe
FTP<- cbind(FTP, data.frame(project(data.frame(FTP$Latitude, FTP$Longitude), proj = pj, inverse = TRUE)))
str(FTP)
## 'data.frame':    130342 obs. of  12 variables:
##  $ Ticket.number        : num  4.34e+09 4.34e+09 4.34e+09 4.34e+09 4.34e+09 ...
##  $ Issue.Date           : chr  "2018-12-23" "2018-12-23" "2018-12-23" "2018-12-23" ...
##  $ Issue.time           : chr  "8:30" "8:36" "8:40" "8:41" ...
##  $ Route                : chr  "340R" "340R" "340R" "340R" ...
##  $ Agency               : int  53 53 53 53 53 53 53 55 55 55 ...
##  $ Violation.code       : chr  "80.73.2" "80.56E4+" "80.69B" "80.69B" ...
##  $ Violation.Description: chr  "EXCEED 72HRS-ST" "RED ZONE" "NO PARKING" "NO PARKING" ...
##  $ Fine.amount          : int  68 93 73 73 73 73 68 363 68 73 ...
##  $ Latitude             : num  6444107 6442254 6444274 6444274 6444274 ...
##  $ Longitude            : num  1906219 1905504 1907268 1907268 1907268 ...
##  $ x                    : num  -118 -118 -118 -118 -118 ...
##  $ y                    : num  34.2 34.2 34.2 34.2 34.2 ...
FTP <- FTP[-9:-10] #This removes the Latitude and Logitude in Feet from our table
names(FTP)[c(9, 10)] <- c('Longitude', 'Latitude') #Rename column names of converted longitude latitude
# Now our data is looking clean and usable
summary(FTP)
##  Ticket.number        Issue.Date         Issue.time       
##  Min.   :1.068e+09   Length:130342      Length:130342     
##  1st Qu.:4.346e+09   Class :character   Class :character  
##  Median :4.346e+09   Mode  :character   Mode  :character  
##  Mean   :4.292e+09                                        
##  3rd Qu.:4.347e+09                                        
##  Max.   :4.348e+09                                        
##                                                           
##     Route               Agency      Violation.code    
##  Length:130342      Min.   : 1.00   Length:130342     
##  Class :character   1st Qu.:53.00   Class :character  
##  Mode  :character   Median :54.00   Mode  :character  
##                     Mean   :53.16                     
##                     3rd Qu.:55.00                     
##                     Max.   :58.00                     
##                                                       
##  Violation.Description  Fine.amount       Longitude         Latitude    
##  Length:130342         Min.   : 25.00   Min.   :-118.7   Min.   :33.71  
##  Class :character      1st Qu.: 63.00   1st Qu.:-118.4   1st Qu.:34.04  
##  Mode  :character      Median : 73.00   Median :-118.3   Median :34.06  
##                        Mean   : 70.77   Mean   :-118.4   Mean   :34.08  
##                        3rd Qu.: 73.00   3rd Qu.:-118.3   3rd Qu.:34.11  
##                        Max.   :363.00   Max.   :-118.2   Max.   :34.33  
##                        NA's   :27

After all the cleaning of our data, now we can format date and time, so we can better work with it in R

FTP$Date <- as.POSIXlt(paste(FTP$Issue.Date, FTP$Issue.time), format="%Y-%m-%d %H:%M")

Now lets identify the weekday each ticket was given and put it into a column

FTP$Weekdays <- weekdays(FTP$Date)

We can also store the hour of the day that the tickets were given and add a column

FTP$Hour <- FTP$Date$hour
summary(FTP)
##  Ticket.number        Issue.Date         Issue.time       
##  Min.   :1.068e+09   Length:130342      Length:130342     
##  1st Qu.:4.346e+09   Class :character   Class :character  
##  Median :4.346e+09   Mode  :character   Mode  :character  
##  Mean   :4.292e+09                                        
##  3rd Qu.:4.347e+09                                        
##  Max.   :4.348e+09                                        
##                                                           
##     Route               Agency      Violation.code    
##  Length:130342      Min.   : 1.00   Length:130342     
##  Class :character   1st Qu.:53.00   Class :character  
##  Mode  :character   Median :54.00   Mode  :character  
##                     Mean   :53.16                     
##                     3rd Qu.:55.00                     
##                     Max.   :58.00                     
##                                                       
##  Violation.Description  Fine.amount       Longitude         Latitude    
##  Length:130342         Min.   : 25.00   Min.   :-118.7   Min.   :33.71  
##  Class :character      1st Qu.: 63.00   1st Qu.:-118.4   1st Qu.:34.04  
##  Mode  :character      Median : 73.00   Median :-118.3   Median :34.06  
##                        Mean   : 70.77   Mean   :-118.4   Mean   :34.08  
##                        3rd Qu.: 73.00   3rd Qu.:-118.3   3rd Qu.:34.11  
##                        Max.   :363.00   Max.   :-118.2   Max.   :34.33  
##                        NA's   :27                                       
##       Date                       Weekdays              Hour      
##  Min.   :2018-12-23 00:02:00   Length:130342      Min.   : 0.00  
##  1st Qu.:2019-01-01 08:40:00   Class :character   1st Qu.: 8.00  
##  Median :2019-01-08 15:40:00   Mode  :character   Median :11.00  
##  Mean   :2019-01-08 08:29:28                      Mean   :11.29  
##  3rd Qu.:2019-01-15 14:35:00                      3rd Qu.:14.00  
##  Max.   :2019-01-23 23:57:00                      Max.   :23.00  
##  NA's   :17                                       NA's   :17
sum(is.na(FTP)) #check for how many NAs there are
## [1] 78
FTP <- na.omit(FTP)# Very few for this many observation (less than 1%)
FTP <- FTP[-11]
summary(FTP)
##  Ticket.number        Issue.Date         Issue.time       
##  Min.   :1.068e+09   Length:130298      Length:130298     
##  1st Qu.:4.346e+09   Class :character   Class :character  
##  Median :4.346e+09   Mode  :character   Mode  :character  
##  Mean   :4.293e+09                                        
##  3rd Qu.:4.347e+09                                        
##  Max.   :4.348e+09                                        
##     Route               Agency      Violation.code    
##  Length:130298      Min.   : 1.00   Length:130298     
##  Class :character   1st Qu.:53.00   Class :character  
##  Mode  :character   Median :54.00   Mode  :character  
##                     Mean   :53.17                     
##                     3rd Qu.:55.00                     
##                     Max.   :58.00                     
##  Violation.Description  Fine.amount       Longitude         Latitude    
##  Length:130298         Min.   : 25.00   Min.   :-118.7   Min.   :33.71  
##  Class :character      1st Qu.: 63.00   1st Qu.:-118.4   1st Qu.:34.04  
##  Mode  :character      Median : 73.00   Median :-118.3   Median :34.06  
##                        Mean   : 70.77   Mean   :-118.4   Mean   :34.08  
##                        3rd Qu.: 73.00   3rd Qu.:-118.3   3rd Qu.:34.11  
##                        Max.   :363.00   Max.   :-118.2   Max.   :34.33  
##    Weekdays              Hour      
##  Length:130298      Min.   : 0.00  
##  Class :character   1st Qu.: 8.00  
##  Mode  :character   Median :11.00  
##                     Mean   :11.29  
##                     3rd Qu.:14.00  
##                     Max.   :23.00

:Analysing the data First lets see the revenue they generated in a year

revenue <- sum(FTP$Fine.amount)
revenue
## [1] 9221737

Filter top 10 Violations

TopViolations <- FTP %>% 
  group_by(Violation.Description) %>% 
  tally() %>% 
  arrange(-n) %>% 
  head(10)

TopViolations

Now lets graph top 10 Violations throughout the year

#I need to make this for month and not year
TopViolationsLastYears <- FTP %>% 
  filter(Violation.Description %in%
           TopViolations$Violation.Description)


p <- ggplot(TopViolationsLastYears, aes(Issue.Date)) + 
  geom_bar(aes(fill=Violation.Description), stat='count')
#Plot the data), stat='count')
ggplotly(p)

Lets see if we can find some more patterns in the data

#This one would be better for a month
DailyParkingViolation <- FTP %>%
  group_by(Issue.Date) %>%
  tally() %>%
  ggplot(aes(x=Issue.Date, y=n)) +
  geom_point()

DailyParkingViolation

It appears there is a cloud of data points towards the top and the bottom of the graph. That is interesting, and we will further need to investigate what this could be.

table(FTP$Weekday)
## 
##    Friday    Monday  Saturday    Sunday  Thursday   Tuesday Wednesday 
##     17920     23383      2874      7501     23316     22381     32923
table(FTP$Hour)
## 
##     0     1     2     3     4     5     6     7     8     9    10    11 
##  1438  3256  3115  1990  1579  1295  1729  2490 17109  8968 16545 11670 
##    12    13    14    15    16    17    18    19    20    21    22    23 
## 16307  7935  5602  3833  6012  4222  4249  3642  2330  1785  2029  1168

We will save this table as a data frame:

WeekdayCounts = as.data.frame(table(FTP$Weekday))

Create our plot

ggplot(WeekdayCounts, aes(x=Var1, y=Freq)) + geom_line(aes(group=1))

It will be easier to understand the data if the days are in order Lets lable our X and Y axis:

WeekdayCounts$Var1 = factor(WeekdayCounts$Var1, ordered=TRUE, levels=c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday","Saturday")) #We can change the Var1 variable to be an ordered factor variable
ggplot(WeekdayCounts, aes(x=Var1, y=Freq)) + geom_line(aes(group=1)) +
  xlab("Day of the Week") + ylab("Total Ticket  Given Out")+
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))

Adding the Hour of the Day

Create a counts table for the weekday and hour:

table(FTP$Weekday, FTP$Hour)
##            
##                0    1    2    3    4    5    6    7    8    9   10   11
##   Friday     199  388  361  224  238  139  232  310 2664 1085 2483 1544
##   Monday     252  609  587  399  139  215  303  444 2716 1776 2978 2415
##   Saturday    97  129   25   19   38   31   14   23   71  117  132  165
##   Sunday     263  411  510  365  163  193  244  164  414  420  329  535
##   Thursday   152  461  511  306  272  165  269  491 3296 1877 3223 2066
##   Tuesday    240  521  491  371  243  233  309  453 2810 1455 3018 1932
##   Wednesday  235  737  630  306  486  319  358  605 5138 2238 4382 3013
##            
##               12   13   14   15   16   17   18   19   20   21   22   23
##   Friday    2514  977  784  593  738  485  483  476  323  263  288  129
##   Monday    2947 1512  978  738 1177  774  749  639  294  255  318  169
##   Saturday   160   95  107   67  222  253  274  213  164  103  196  159
##   Sunday     385  373  325  142  403  317  401  305  333  202  165  139
##   Thursday  3014 1569 1018  728 1040  725  682  426  316  266  263  180
##   Tuesday   3159 1339 1035  666  950  702  659  575  410  295  334  181
##   Wednesday 4128 2070 1355  899 1482  966 1001 1008  490  401  465  211

We will save this as a data frame

DayHourCounts = as.data.frame(table(FTP$Weekday, FTP$Hour))
DayHourCounts$Hour = as.numeric(as.character(DayHourCounts$Var2))# Convert the second variable, Var2, to numbers and call it Hour:
ggplot(DayHourCounts, aes(x=Hour, y=Freq)) + geom_line(aes(group=Var1))# Create out plot:

Change the colors

# Fix the order of the days:

ggplot(DayHourCounts, aes(x=Hour, y=Freq)) + geom_line(aes(group=Var1, color=Var1), size=2)

Separate the weekends from the weekdays:

DayHourCounts$Type = ifelse((DayHourCounts$Var1 == "Sunday") | (DayHourCounts$Var1 == "Saturday"),
                            "Weekend", "Weekday")
# Redo our plot, this time coloring by Type:

ggplot(DayHourCounts, aes(x=Hour, y=Freq)) + geom_line(aes(group=Var1, color=Type), size=2, alpha=0.5) 

Make a heatmap:

# Fix the order of the days:
DayHourCounts$Var1 = factor(DayHourCounts$Var1, ordered=TRUE, levels=c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"))
ggplot(DayHourCounts, aes(x = Hour, y = Var1)) + geom_tile(aes(fill = Freq))

Change the label on the legend, and get rid of the y-label:

ggplot(DayHourCounts, aes(x = Hour, y = Var1)) + geom_tile(aes(fill = Freq)) +
  scale_fill_gradient(name="Total Tickets Given") + 
  theme(axis.title.y = element_blank())

Change the color scheme

ggplot(DayHourCounts, aes(x = Hour, y = Var1)) + geom_tile(aes(fill = Freq)) +
  scale_fill_gradient(name="Total Tickets Given", low="white", high="red") +
  theme(axis.title.y = element_blank())